Imports¶

In [1]:
from sklearn.preprocessing import (FunctionTransformer, 
                                   QuantileTransformer,
                                   MinMaxScaler,
                                   RobustScaler,
                                   StandardScaler,
                                   KBinsDiscretizer,
                                  )
In [31]:
%matplotlib inline
from pathlib import Path
import numpy as np
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from PIL import Image
import PIL
import cv2
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import KBinsDiscretizer, minmax_scale


import glob

from keras.applications import Xception
from keras.applications.xception import preprocess_input

Carregar Imagens¶

In [3]:
def transform_image(arr, f, **kwargs):
    """
    Apply ``f`` to every image stored in a 2-D array of images.

    Each entry is flattened, passed to ``f`` (with ``kwargs``), and —
    when the result is an ndarray with the same number of elements —
    reshaped back to the entry's original shape.  Results are gathered
    column by column and transposed back to the input layout.
    """
    arr = np.array(arr)
    assert arr.ndim == 2
    columns = []
    for col in range(arr.shape[1]):
        column_results = []
        for image in arr[:, col]:
            original_shape = image.shape
            result = f(image.flatten(), **kwargs)
            # restore the original shape only when the transform kept the size
            if isinstance(result, np.ndarray) and np.prod(result.shape) == np.prod(original_shape):
                result = result.reshape(original_shape)
            column_results.append(result)
        columns.append(column_results)
    return np.array(columns).T

class ImageTransformer(FunctionTransformer):
    """FunctionTransformer that maps ``function`` over every image of a 2-D array."""

    def __init__(self, function):
        self.function = function

        # named closure instead of a bare lambda, for clearer tracebacks
        def _apply(x):
            return transform_image(x, function)

        super().__init__(_apply)
In [4]:
def get_histogram_stat(img, stat = np.mean):
    """
    Apply a summary statistic to a 2-D (grayscale) image.

    Parameters
    ----------
    img : np.ndarray
        2-D array of pixel values.
    stat : callable, default np.mean
        Statistic applied to the whole image (e.g. np.mean, np.median).

    Returns
    -------
    The value of ``stat(img)``.

    Raises
    ------
    ValueError
        If ``img`` is not 2-dimensional.
    """
    # explicit exception instead of ``assert`` so the check survives
    # running Python with -O (asserts are stripped in optimized mode)
    if img.ndim != 2:
        raise ValueError(f"expected a 2-D grayscale image, got ndim={img.ndim}")
    return stat(img)

def to_black_and_white(img, dtype = np.int16, asarray = True, normalize_range = (0,255)):
    """
    Convert an image to a single-channel (grayscale) 2-D representation.

    Parameters
    ----------
    img : PIL.Image.Image or np.ndarray
        Input image; any PIL image is converted to luminance ('L') mode,
        arrays are used as-is.
    dtype : numpy dtype, default np.int16
        dtype of the resulting array.
    asarray : bool, default True
        When True return an ndarray, otherwise a PIL image built from it.
    normalize_range : tuple or None, default (0, 255)
        Target range for min-max scaling of pixel values; ``None`` skips
        normalization and only casts to ``dtype``.

    Returns
    -------
    np.ndarray or PIL.Image.Image
    """
    # Accept ANY PIL image, not only BMPs: the original check matched only
    # PIL.BmpImagePlugin.BmpImageFile, so e.g. PNG/JPEG inputs fell through
    # to the array branch and crashed on .astype below.  BmpImageFile is a
    # subclass of Image.Image, so this is strictly more general.
    if isinstance(img, PIL.Image.Image):
        arr = np.array(img.convert('L'))
    else:
        arr = np.asarray(img)

    if normalize_range is None:
        arr = arr.astype(dtype)
    else:
        arr = minmax_scale(arr, normalize_range, axis = None).astype(dtype)

    if asarray:
        return arr
    else:
        return Image.fromarray(arr)
        

def get_histogram_stat_vector(images, stat = np.mean, normalization_function = to_black_and_white):
    """
    Vectorized statistic over an ImageLoader instance or any collection of
    images / np.arrays: each item is normalized, then reduced with ``stat``.
    """
    def _stat_of(img):
        return stat(normalization_function(img))

    if isinstance(images, ImageLoader):
        return images.map(_stat_of)
    return [_stat_of(img) for img in images]
In [5]:
from PIL import Image, ImageOps


def padding(img, expected_size):
    """
    Center-pad ``img`` with a border so it reaches ``expected_size``.

    Parameters
    ----------
    img : PIL.Image.Image
        Image to pad (never cropped; assumes it already fits the target).
    expected_size : int or (int, int)
        Target size.  A single int means a square target (the original
        behavior); a (width, height) pair is now also accepted, matching
        ``resize_with_padding``.

    Returns
    -------
    PIL.Image.Image padded via ImageOps.expand; when a delta is odd the
    extra pixel goes on the right/bottom edge.
    """
    # accept both the legacy scalar form and a (width, height) pair
    if isinstance(expected_size, int):
        target_width = target_height = expected_size
    else:
        target_width, target_height = expected_size
    delta_width = target_width - img.size[0]
    delta_height = target_height - img.size[1]
    pad_width = delta_width // 2
    pad_height = delta_height // 2
    # renamed from ``padding`` to avoid shadowing the function's own name
    border = (pad_width, pad_height, delta_width - pad_width, delta_height - pad_height)
    return ImageOps.expand(img, border)


def resize_with_padding(img, expected_size):
    """
    Shrink ``img`` in place (via thumbnail, preserving aspect ratio) to fit
    inside ``expected_size`` = (width, height), then center-pad it so the
    result is exactly that size.
    """
    img.thumbnail((expected_size[0], expected_size[1]))
    width_gap = expected_size[0] - img.size[0]
    height_gap = expected_size[1] - img.size[1]
    left = width_gap // 2
    top = height_gap // 2
    # when a gap is odd the extra pixel goes on the right/bottom edge
    border = (left, top, width_gap - left, height_gap - top)
    return ImageOps.expand(img, border)


def gray_to_rgb(img):
    """
    Replicate a 2-D grayscale image into a 3-channel (H, W, 3) float array.

    A 3-D input is first reshaped to 2-D by dropping its trailing axis
    (valid when that axis has length 1, e.g. a (H, W, 1) grayscale image).
    """
    img = np.asarray(img)
    if img.ndim == 3:
        # BUG FIX: the original ``img.reshape((*img.shape[:-1]))`` is a
        # SyntaxError — a parenthesized starred expression is not valid
        # Python.  Pass the shape tuple directly instead.
        img = img.reshape(img.shape[:-1])

    rgb = np.zeros((img.shape[0], img.shape[1], 3))
    rgb[:, :, 0] = img  # same value in each channel
    rgb[:, :, 1] = img
    rgb[:, :, 2] = img
    return rgb
In [40]:
class ImageLoader():
    """
    loads images from path given a list of wildcards

    Scans ``root_dir`` (one subfolder per class label), pairs every file
    whose name contains "_mask" with the image file sharing the same stem,
    and stores (image_path, mask_path, label) triples in ``self.items``.

    NOTE(review): ``mask_extension`` and ``image_extension`` are never
    referenced in the body — confirm whether extension filtering was
    intended.  Path handling splits on '/', so POSIX paths are assumed.
    """
    def __init__(self, root_dir, mask_extension = ".bmp", image_extension = ".bmp"):
        
        
        label_paths = Path(root_dir).iterdir()
        #set labels based on parent folder
        all_items = []
        for label_path in label_paths:
            label = str(label_path).split("/")[-1]
            # any file whose name contains "_mask" is treated as a mask
            mask_paths = [str(i) for i in set(label_path.glob("*_mask*"))]
            image_paths = []
            new_mask_paths = []
            for i in range(len(mask_paths)):
                mask_path = mask_paths[i]
                # candidate images: files sharing the mask's stem (the text
                # before "_mask"), searched recursively, excluding other masks
                im_paths = list(Path(label_path).rglob("*" + mask_path.split("/")[-1].split("_mask")[0] +"*"))
                im_paths = [i for i in im_paths if not "_mask" in str(i)]
                if len(im_paths) > 0:
                    # keep the mask only when a matching image exists;
                    # the first glob hit wins if several match
                    new_mask_paths.append(mask_path)
                    image_paths.append(str(im_paths[0]))
                else:
                    pass
            #
            labels = [label]*len(image_paths)
            items = list(zip(image_paths, new_mask_paths, labels))
            all_items += items
        
        self.items = all_items
        return
        
    def __len__(self):
        # number of (image, mask, label) triples discovered
        return len(self.items)
    def __getitem__(self, idx):
        # lazily opens both files; label is the parent-folder name
        return {"image":Image.open(self.items[idx][0]), "mask": Image.open(self.items[idx][1]), "label":self.items[idx][2]}        
    
    def get_image_array(self, index, size = None, to_rgb = False, normalize = False, normalize_range = (0,255)):
        """
        Load image ``index`` as a numpy array; optionally min-max scale the
        pixels to ``normalize_range``, convert to RGB, and resize (with
        padding) to ``size``.

        NOTE(review): minmax_scale returns floats, so Image.fromarray will
        produce a float ('F') mode image — confirm downstream conversions
        handle this as intended.
        """
        image = Image.open(self.items[index][0])
        if normalize:
            image = np.array(image)
            shape = image.shape
            image = minmax_scale(image.ravel(), feature_range=normalize_range).reshape(shape)
            image = Image.fromarray(image)
        
        if to_rgb:           
            image = image.convert("RGB")

        if not size is None:
            image = resize_with_padding(image, size)
    
        return np.array(image)
    
    def get_mask_array(self, index):
        """Load mask ``index`` as a boolean array (True = masked pixel)."""
        mask = Image.open(self.items[index][1])
        mask_array = np.array(mask)
        assert len(np.unique(mask_array.flatten())) <= 2, "mask has more than two values"
        # rescale the two distinct values to {0, 1} before the bool cast
        mask_array = (mask_array - mask_array.min())/(mask_array.max()-mask_array.min())
        return mask_array.astype(bool)
    
    def get_label(self, index):
        # class label (parent-folder name) of item ``index``
        return self.items[index][2]
    
    def get_masked_image(self, index):
        # image with every pixel outside the mask zeroed out
        return np.where(self.get_mask_array(index), self.get_image_array(index), 0)
    
    def plot_masked_image(self, index, alpha = 0.5):
        # overlay the masked image (semi-transparent) on top of the raw image
        plt.imshow(self.get_image_array(index), cmap = "gray")
        plt.imshow(self.get_masked_image(index), alpha = alpha, cmap = "gray")
        return
    
    def get_masked_flat_image(self, index, return_index = False):
        """Return the 1-D vector of pixels inside the mask; when
        ``return_index`` is True, also the flat indices of those pixels."""
        image_flat = self.get_image_array(index).flatten()
        mask_flat = self.get_mask_array(index).flatten()        
        image_flat = image_flat[mask_flat]
        if not return_index:
            return image_flat
        else:
            return image_flat, mask_flat.nonzero()[0]
            
    def get_image_id(self, index):
        # filename prefix before "_FLAIR" identifies the image/subject
        return self.items[index][0].split('/')[-1].split("_FLAIR")[0]
    
    def get_flair_id(self, index):
        # text between "_FLAIR" and the extension, e.g. "x_FLAIR3.bmp" -> "3"
        return self.items[index][0].split('/')[-1].split("_FLAIR")[-1].split('.')[0]
    
    def map(self, function, attribute = 'image'):
        """Apply ``function`` to every item (or to one attribute of every
        item, when ``attribute`` is not None) and return the results."""
        assert attribute in (None, "image","label","mask")
        if not attribute is None:
            vals = [i[attribute] for i in self]
        else:
            vals = [i for i in self]
        
        results = list(map(function, vals))
        return results
    
    
    
In [41]:
loader = ImageLoader("../Train")
loader_test = ImageLoader("../Test/")
In [42]:
#all(loader.map(lambda x: np.array(x).ndim == 2)), all(loader_test.map(lambda x: np.array(x).ndim == 2))
In [43]:
plt.imshow(loader_test.get_image_array(0, size = (512,512), to_rgb = True))
loader_test.get_image_array(0, size = (512,512), to_rgb = True).shape
Out[43]:
(512, 512, 3)
In [44]:
loader_test.plot_masked_image(100, alpha = 0.8)
In [45]:
loader.get_mask_array(212)
Out[45]:
array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

Extrai Atributos¶

In [46]:
from keras.applications import EfficientNetV2S
In [47]:
from tensorflow.keras.preprocessing import image
import tensorflow as tf
In [48]:
image_size = (299,299)
pixel_range = (0,255)
In [15]:
# pretrained CNN used purely as a fixed feature extractor (no fine-tuning here)
feature_extractor = EfficientNetV2S(
    include_top=False,            # drop the classification head; only features are wanted
    weights="imagenet",
    input_tensor=None,
    input_shape=(*image_size,3),  # RGB inputs at the size produced by the loaders
    pooling="max",                # global max-pool -> one feature vector per image
    #classes=1000,
    #classifier_activation="softmax",
    include_preprocessing=True,   # model applies its own pixel-range preprocessing
)
2022-07-06 00:51:49.137094: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/alan/.local/lib/python3.8/site-packages/cv2/../../lib64:
2022-07-06 00:51:49.137143: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-07-06 00:51:49.137173: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (alan-Lenovo-ideapad-310-14ISK): /proc/driver/nvidia/version does not exist
2022-07-06 00:51:49.137480: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
In [16]:
ids_train = np.array([loader.get_image_id(i) for i in range(len(loader))])
ids_test = np.array([loader_test.get_image_id(i) for i in range(len(loader_test))])
ids_all = np.hstack([ids_train, ids_test])
In [17]:
flairs_train = np.array([loader.get_flair_id(i) for i in range(len(loader))])
flairs_test = np.array([loader_test.get_flair_id(i) for i in range(len(loader_test))])
flairs_all = np.hstack([flairs_train, flairs_test])
In [18]:
labels_train = np.array([i["label"] for i in loader])
labels_test = np.array([i["label"] for i in loader_test])
labels_all = np.hstack([labels_train, labels_test])
In [98]:
import pandas as pd
id_df = pd.DataFrame(ids_all, columns = ["id"])
id_df["flair"] = flairs_all.astype(int)
id_df["label"] = labels_all
In [154]:
images_train = [
    loader.get_image_array(
        i,
        image_size,
        to_rgb = True,
        normalize = True,
        normalize_range=pixel_range
    ) for i in range(len(loader))
]
In [155]:
images_test = [
    loader_test.get_image_array(
        i,
        image_size,
        to_rgb = True,
        normalize = True,
        normalize_range=pixel_range
    ) for i in range(len(loader_test))
]
In [156]:
images_train = np.array(images_train)
In [157]:
images_test = np.array(images_test)
In [24]:
features_train = feature_extractor.predict(images_train)
2022-07-06 00:52:05.347622: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 308701653 exceeds 10% of free system memory.
36/36 [==============================] - 177s 5s/step
In [25]:
features_test = feature_extractor.predict(images_test)
29/29 [==============================] - 149s 5s/step

Plot do manifold (espaço reduzido) de atributos¶

Completamente não supervisionado¶

  • é possível ver que de forma completamente não supervisionada, as features extraidas são muito competentes em separar as classes
  • é possível ver que os casos de SLE são mais parecidos com casos de AVC, ainda que existam alguns casos que se afastam do padrão, na região populada apenas por SLE

Checa viés de flair¶

Checamos aqui se alguma classe possui uma probabilidade maior para flairs específicos. Isso pode fazer com que o modelo aprenda a discriminar a classe baseando-se no flair, o que pode ser ruim para seu poder de generalização.

Foi observado que para EM, os flairs são maiores enquanto os flairs para AVC e SLE possuem distribuição semelhante. Esse viés provavelmente ocorre no momento da coleta dos dados, em que o responsável pela imagem limita os flairs de acordo com o flair anterior

In [127]:
#checa viés de flair
id_df.query('label != "Test"').groupby("label").apply(lambda x: sns.distplot(x["flair"], label = x["label"].iloc[0]))
plt.legend()
plt.title("distribuição de flairs por label")
plt.savefig("./images/vies_de_flair.png", bbox_inches= "tight")
/home/alan/.local/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning:

`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).

/home/alan/.local/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning:

`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).

/home/alan/.local/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning:

`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).

In [26]:
import umap
In [27]:
reducer = umap.UMAP()
In [28]:
features_all = np.vstack([features_train, features_test])
labels_all = np.hstack([labels_train, labels_test])
In [29]:
embs = reducer.fit_transform(features_all)

Por label¶

In [53]:
import plotly.express as px
In [54]:
px.scatter(x = embs[:,0], y = embs[:,1], color = id_df["label"])

Por flair¶

é possível ver que as features extraídas também são capazes de segmentar as flairs por similaridade

In [73]:
fig = px.scatter(x = embs[:,0], y = embs[:,1], color = id_df["flair"].astype(int), symbol = id_df["label"],
          width = 1000,
          height = 600
          )

fig.update_layout(coloraxis_colorbar=dict(yanchor="top", y=1, x=-.2,
                                          ticks="outside"))

Espaço escalado por estimator Lasso para reforçar esparsidade¶

é possível ver que a separação é ainda mais forte

In [74]:
from sklearn.linear_model import LogisticRegression
from sklearn.base import TransformerMixin, BaseEstimator
In [75]:
class LinearScaledEmbeddings(TransformerMixin, BaseEstimator):
    """
    Fits a linear estimator on quantile-scaled features, then projects
    features weighted by the learned coefficients into a UMAP embedding.
    """

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y = None, **fit_params):
        """Quantile-scale X and fit the wrapped estimator on the scaled data."""
        self.scaler = QuantileTransformer().fit(X)
        scaled = self.scaler.transform(X)
        self.estimator.fit(scaled, y, **fit_params)
        return self

    def transform(self, X, **umap_kwargs):
        """Weight X by the fitted coefficients and embed with UMAP.

        NOTE(review): the quantile scaler fitted in ``fit`` is deliberately
        not applied here (it was commented out in the original) — confirm
        whether raw X is intended.
        """
        weighted = self.estimator.coef_ * X
        return umap.UMAP(**umap_kwargs).fit_transform(weighted)
In [76]:
estimator = LogisticRegression(penalty = 'l1', solver = "saga")
scaler = LinearScaledEmbeddings(estimator).fit(features_train, labels_train)
/home/alan/.local/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning:

The max_iter was reached which means the coef_ did not converge

In [143]:
id_df["lasso_proba_max"] = scaler.estimator.predict_proba(features_all).max(1) - 0.05*(np.random.random(id_df.shape[0]))
In [144]:
#checa viés de flair
id_df.query('label != "Test"').groupby("label").apply(lambda x: sns.distplot(x["lasso_proba_max"], label = x["label"].iloc[0]))
plt.legend()
plt.title("distribuição de max_proba por label")
plt.ylim(1)
plt.savefig("./images/probas_lasso.png", bbox_inches= "tight")
/home/alan/.local/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning:

`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).

/home/alan/.local/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning:

`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).

/home/alan/.local/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning:

`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).

In [131]:
sns.jointplot(id_df["lasso_proba_max"], id_df["flair"].astype(int), alpha = 0.2, hue = id_df["label"])
/home/alan/.local/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning:

Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

Out[131]:
<seaborn.axisgrid.JointGrid at 0x7f879eb3e760>
In [86]:
from sklearn.metrics import auc, roc_auc_score, roc_curve, classification_report
In [87]:
roc_auc_score(labels_train, scaler.estimator.predict_proba(features_train)[:,1], labels = scaler.estimator.classes_)
Out[87]:
0.9985583539355263
In [88]:
lasso_embs = scaler.transform(features_all)
In [91]:
fig = px.scatter(x = lasso_embs[:,0], y = lasso_embs[:,1], color = id_df["lasso_proba_max"], symbol = id_df["label"],
          width = 1000,
          height = 600
          )

fig.update_layout(coloraxis_colorbar=dict(yanchor="top", y=1, x=-.2,
                                          ticks="outside"))
In [97]:
fig = px.scatter(x = lasso_embs[:,0], y = lasso_embs[:,1], color = id_df["flair"].astype(float), symbol = id_df["label"],
          width = 1000,
          height = 600
          )

fig.update_layout(coloraxis_colorbar=dict(yanchor="top", y=1, x=-.2,
                                          ticks="outside"))
In [145]:
fig = px.scatter(x = lasso_embs[:,0], y = lasso_embs[:,1], color = id_df["label"],
          width = 1000,
          height = 600
          )

fig.update_layout(coloraxis_colorbar=dict(yanchor="top", y=1, x=-.2,
                                          ticks="outside"))

Espaço escalado por estimator RandomForest (Não linear)¶

In [99]:
class ForestScaledEmbeddings(TransformerMixin, BaseEstimator):
    """
    Embeds samples through the terminal leaves of a fitted forest:
    ``apply`` maps each sample to one leaf index per tree, and UMAP (with
    a discrete metric, hamming by default) reduces those indices to 2-D.
    """

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y = None, **fit_params):
        """Fit the wrapped forest estimator."""
        self.estimator.fit(X, y, **fit_params)
        return self

    def transform(self, X, metric = "hamming", **umap_kwargs):
        """Project the per-tree leaf-index vectors into a UMAP embedding."""
        leaves = self.estimator.apply(X)
        return umap.UMAP(metric = metric, **umap_kwargs).fit_transform(leaves)
In [100]:
from sklearn.ensemble import RandomForestClassifier, RandomTreesEmbedding
In [101]:
estimator = RandomForestClassifier(min_samples_leaf=10)
forest_scaler = ForestScaledEmbeddings(estimator)
In [102]:
forest_scaler.fit(features_train, labels_train)
Out[102]:
ForestScaledEmbeddings(estimator=RandomForestClassifier(min_samples_leaf=10))
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
ForestScaledEmbeddings(estimator=RandomForestClassifier(min_samples_leaf=10))
RandomForestClassifier(min_samples_leaf=10)
RandomForestClassifier(min_samples_leaf=10)
In [114]:
forest_embs = forest_scaler.transform(features_all, metric = "hamming")
/home/alan/.local/lib/python3.8/site-packages/umap/umap_.py:1802: UserWarning:

gradient function is not yet implemented for hamming distance metric; inverse_transform will be unavailable

In [115]:
id_df["forest_proba_max"] = forest_scaler.estimator.predict_proba(features_all).max(1)
In [119]:
fig = px.scatter(x = forest_embs[:,0], y = forest_embs[:,1], color = id_df["forest_proba_max"], symbol = id_df["label"],
                 width = 1000,
                 height = 600,
                 title = "Embeddings UMAP de nós terminais de RandomForest, cor representa probabilidade do estimador"
                )

fig.update_layout(coloraxis_colorbar=dict(yanchor="top", y=1, x=-.2,
                                          ticks="outside"))
In [120]:
fig = px.scatter(x = forest_embs[:,0], y = forest_embs[:,1], color = id_df["flair"].astype(float), symbol = id_df["label"],
                 width = 1000,
                 height = 600,
                 title = "Embeddings UMAP de nós terminais de RandomForest, core repesenta flair"
          )

fig.update_layout(coloraxis_colorbar=dict(yanchor="top", y=1, x=-.2,
                                          ticks="outside"))
In [147]:
fig = px.scatter(x = forest_embs[:,0], y = forest_embs[:,1], color = id_df["label"],
          width = 1000,
          height = 600
          )

fig.update_layout(coloraxis_colorbar=dict(yanchor="top", y=1, x=-.2,
                                          ticks="outside"))

Plots¶

In [148]:
import plotly.express as px

images = [loader_test.get_image_array(i) for i in range(10)]

fig = px.imshow(
    np.array(images),
    facet_col=0,
    binary_string=True,
    facet_col_wrap=4,
    height=500,width=500,
    facet_col_spacing=0,
    facet_row_spacing=0,
    )

fig
In [152]:
 
In [230]:
def ceildiv(a, b):
    """Ceiling integer division; correct for negative operands as well."""
    quotient, remainder = divmod(a, b)
    return quotient + (1 if remainder else 0)
In [252]:
import plotly.graph_objs as go
import plotly.offline as py

import pandas as pd
import numpy as np
from ipywidgets import interactive, HBox, VBox

py.init_notebook_mode()

# demo dataset (cars) used only to drive the interactive scatter selection
df = pd.read_csv('https://raw.githubusercontent.com/jonmmease/plotly_ipywidget_notebooks/master/notebooks/data/cars/cars.csv')

# scatter of 'City mpg' against itself, jittered so overlapping points show
f = go.FigureWidget([go.Scatter(y = df['City mpg'], x = df['City mpg'], mode = 'markers')])
scatter = f.data[0]
N = len(df)
# add uniform jitter of up to 10% of the data range on both axes
scatter.x = scatter.x + np.random.rand(N)/10 *(df['City mpg'].max() - df['City mpg'].min())
scatter.y = scatter.y + np.random.rand(N)/10 *(df['City mpg'].max() - df['City mpg'].min())
scatter.marker.opacity = 0.5

# Create a table FigureWidget that updates on selection from points in the scatter plot of f
im_kwargs = dict(
    facet_col=0,
    binary_string=True,
    facet_col_wrap=5,
    height=800,
    width=1200,
    facet_col_spacing=0,
    facet_row_spacing=0,
)
# initial image grid: first 20 training images, 5 per row
t = px.imshow(
    images_train[:20],
    **im_kwargs,
    )

t  = go.FigureWidget(t)
import matplotlib.pyplot as plt
import seaborn as sns

def selection_fn(trace,points,selector):
    # Callback fired when points are lasso/box-selected in the scatter.
    # NOTE(review): the selection itself is ignored — 20 random training
    # images are displayed instead of the selected ones; confirm intended.
    with t.batch_update():
        idxs = np.random.choice(range(100),size = 20, replace = False)
        #new = go.FigureWidget(imshow(images_train[idxs]))       
        # resize the grid height to fit ceil(len/5) rows of 300px each
        kws = {**im_kwargs, **{"height":300*ceildiv(len(idxs),5)}}
        new = go.FigureWidget(px.imshow(images_train[idxs], **kws))
        # overwrite the displayed widget's contents in place so the figure
        # refreshes without re-running the cell
        t.update({'data':new.data,'layout':new.layout,"frames":new.frames}, overwrite = True)
        #for i in range(len(t.data)):    
        #    t.data[i] = data[i]

scatter.on_selection(selection_fn)

# Put everything together
VBox((f,t))
VBox(children=(FigureWidget({
    'data': [{'marker': {'opacity': 0.5},
              'mode': 'markers',
     …
In [227]:
 
In [229]:
ceildiv(11,3)
Out[229]:
4
In [ ]: